1 Prepare Data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
## here() starts at /Users/Jo/OneDrive/1_Hertie Studies/Thesis/Hertie-Thesis-Mehler
library(stats)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(corrplot)
## corrplot 0.92 loaded
library(descr)

# Load the combined data set; here() resolves the path from the project root
data <- read_csv(here("data", "data_combined.csv"))
## Rows: 1079 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): ResponseId, cluster, hate_definition, academic_status, educ_cat, g...
## dbl (14): leftright, leftright_pred_error, readability_score, text_length, t...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Indicators: cluster label plus the three numeric indicator variables.
# Incomplete rows are dropped first; the 2 extreme readability outliers
# (scores of 40 or above) are removed afterwards.
data_indicators <- data %>%
  select(all_of(c(
    "cluster",
    "text_length_log2",
    "readability_score",
    "leftright_pred_error"
  ))) %>%
  drop_na() %>%
  filter(readability_score < 40)

# Controls in their categorical coding (complete cases only)
controls_cat <- data %>%
  select(all_of(c(
    "academic_status",
    "gender",
    "age_cat",
    "polinterest_cat_3",
    "empathy_pc_cat",
    "exp_hate_speech_cat",
    "exp_hostile_engagement_cat",
    "commitment_cat"
  ))) %>%
  drop_na()

# Controls in their numerical coding (complete cases only)
controls_num <- data %>%
  select(all_of(c(
    "academic_status",
    "age10",
    "polinterest",
    "empathy_pc",
    "exp_hate_speech",
    "exp_hostile_engagement",
    "commitment_log2"
  ))) %>%
  drop_na()

2 Correlations between Indicators

2.1 Scatter Plots and Correlation

# Column order for the pairs plot: continuous variables first, cluster last
indicators <- c(
  "text_length_log2",
  "readability_score",
  "leftright_pred_error",
  "cluster"
)

ggpairs(data = data_indicators, columns = indicators)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

2.2 Correlation Matrix (without significance)

# Choose only the numeric indicators (the cluster label is left out)
data_indicators_num <- select(
  data_indicators,
  text_length_log2,
  readability_score,
  leftright_pred_error
)

# Pearson correlations between the numeric indicators
cor_matrix <- data_indicators_num %>%
  cor(method = "pearson")
print(cor_matrix)
##                      text_length_log2 readability_score leftright_pred_error
## text_length_log2           1.00000000        0.43375846           0.02236707
## readability_score          0.43375846        1.00000000           0.09965684
## leftright_pred_error       0.02236707        0.09965684           1.00000000

2.3 Correlation Plot

# Lower-triangle colour plot of the correlation matrix with coefficients shown
corrplot(
  cor_matrix,
  method = "color",
  type = "lower",
  tl.cex = 1,
  tl.col = "black",
  addCoef.col = "black"
)

3 Correlations between Controls

3.1 Correlation Matrix (without significance)

# academic_status is dropped so that only the remaining controls are correlated
controls_num_selection <- select(controls_num, -academic_status)

# Spearman rank correlations between the remaining controls
cor_matrix <- controls_num_selection %>%
  cor(method = "spearman")
print(cor_matrix)
##                             age10 polinterest  empathy_pc exp_hate_speech
## age10                   1.0000000  0.22506279  0.10992547     -0.13455076
## polinterest             0.2250628  1.00000000 -0.06930434      0.08349955
## empathy_pc              0.1099255 -0.06930434  1.00000000     -0.11562140
## exp_hate_speech        -0.1345508  0.08349955 -0.11562140      1.00000000
## exp_hostile_engagement  0.1249221  0.29834697  0.03949850      0.26893914
## commitment_log2         0.1675874 -0.04808784  0.04524649     -0.18552808
##                        exp_hostile_engagement commitment_log2
## age10                              0.12492209      0.16758736
## polinterest                        0.29834697     -0.04808784
## empathy_pc                         0.03949850      0.04524649
## exp_hate_speech                    0.26893914     -0.18552808
## exp_hostile_engagement             1.00000000     -0.06618384
## commitment_log2                   -0.06618384      1.00000000

3.2 Correlation Plot

# Lower-triangle colour plot of the correlation matrix with coefficients shown
corrplot(
  cor_matrix,
  method = "color",
  type = "lower",
  tl.cex = 1,
  tl.col = "black",
  addCoef.col = "black"
)

3.3 Exploration: Relationship between educ_cat and potential controls

# Relationship between education (educ_cat, columns) and experience with
# online hostile engagement (rows); prop.c = TRUE adds column percentages,
# i.e. the share within each education group
crosstab(data$exp_hostile_engagement_cat, data$educ_cat, prop.c = TRUE)

##    Cell Contents 
## |-------------------------|
## |                   Count | 
## |          Column Percent | 
## |-------------------------|
## 
## =======================================================================
##                                    data$educ_cat
## data$exp_hostile_engagement_cat     High   Intermediate     Low   Total
## -----------------------------------------------------------------------
## Less experience                     260             83      75     418 
##                                    42.8%          33.7%   35.0%        
## -----------------------------------------------------------------------
## More experience                     347            163     139     649 
##                                    57.2%          66.3%   65.0%        
## -----------------------------------------------------------------------
## Total                               607            246     214    1067 
##                                    56.9%          23.1%   20.1%        
## =======================================================================
# Relationship between education (columns) and experience with hate speech
# (rows), with column percentages
crosstab(data$exp_hate_speech_cat, data$educ_cat, prop.c = TRUE)

##    Cell Contents 
## |-------------------------|
## |                   Count | 
## |          Column Percent | 
## |-------------------------|
## 
## ================================================================
##                             data$educ_cat
## data$exp_hate_speech_cat     High   Intermediate     Low   Total
## ----------------------------------------------------------------
## Less experience              326            131     119     576 
##                             53.7%          53.3%   55.3%        
## ----------------------------------------------------------------
## More experience              281            115      96     492 
##                             46.3%          46.7%   44.7%        
## ----------------------------------------------------------------
## Total                        607            246     215    1068 
##                             56.8%          23.0%   20.1%        
## ================================================================
# Relationship between education (columns) and political interest in its
# categorical coding (rows), with column percentages
crosstab(data$polinterest_cat_3, data$educ_cat, prop.c = TRUE)

##    Cell Contents 
## |-------------------------|
## |                   Count | 
## |          Column Percent | 
## |-------------------------|
## 
## ==============================================================
##                           data$educ_cat
## data$polinterest_cat_3     High   Intermediate     Low   Total
## --------------------------------------------------------------
## High                       267             98      93     458 
##                           43.9%          39.8%   43.3%        
## --------------------------------------------------------------
## Intermediate               242            115      81     438 
##                           39.8%          46.7%   37.7%        
## --------------------------------------------------------------
## Low                         99             33      41     173 
##                           16.3%          13.4%   19.1%        
## --------------------------------------------------------------
## Total                      608            246     215    1069 
##                           56.9%          23.0%   20.1%        
## ==============================================================
# Same comparison using the original numeric political interest scores as
# row categories, with column percentages
crosstab(data$polinterest, data$educ_cat, prop.c = TRUE)

##    Cell Contents 
## |-------------------------|
## |                   Count | 
## |          Column Percent | 
## |-------------------------|
## 
## ========================================================
##                     data$educ_cat
## data$polinterest     High   Intermediate     Low   Total
## --------------------------------------------------------
## 1                     34              8      10      52 
##                      5.6%           3.3%    4.7%        
## --------------------------------------------------------
## 2                     65             25      31     121 
##                     10.7%          10.2%   14.4%        
## --------------------------------------------------------
## 3                    242            115      81     438 
##                     39.8%          46.7%   37.7%        
## --------------------------------------------------------
## 4                    267             98      93     458 
##                     43.9%          39.8%   43.3%        
## --------------------------------------------------------
## Total                608            246     215    1069 
##                     56.9%          23.0%   20.1%        
## ========================================================
# Relationship between education (columns) and the categorical empathy
# measure (rows), with column percentages
crosstab(data$empathy_pc_cat, data$educ_cat, prop.c = TRUE)

##    Cell Contents 
## |-------------------------|
## |                   Count | 
## |          Column Percent | 
## |-------------------------|
## 
## ===========================================================
##                        data$educ_cat
## data$empathy_pc_cat     High   Intermediate     Low   Total
## -----------------------------------------------------------
## Less empathetic         340            124      99     563 
##                        56.4%          50.4%   46.3%        
## -----------------------------------------------------------
## More empathetic         263            122     115     500 
##                        43.6%          49.6%   53.7%        
## -----------------------------------------------------------
## Total                   603            246     214    1063 
##                        56.7%          23.1%   20.1%        
## ===========================================================
# Relationship between education (columns) and the numeric left-right
# self-placement (rows), with column percentages
crosstab(data$leftright, data$educ_cat, prop.c = TRUE)

##    Cell Contents 
## |-------------------------|
## |                   Count | 
## |          Column Percent | 
## |-------------------------|
## 
## ======================================================
##                   data$educ_cat
## data$leftright     High   Intermediate     Low   Total
## ------------------------------------------------------
## 1                   26             10      13      49 
##                    4.3%           4.1%    6.1%        
## ------------------------------------------------------
## 2                   20              9      10      39 
##                    3.3%           3.7%    4.7%        
## ------------------------------------------------------
## 3                   62             12      17      91 
##                   10.3%           5.0%    8.0%        
## ------------------------------------------------------
## 4                   47             24      12      83 
##                    7.8%           9.9%    5.7%        
## ------------------------------------------------------
## 5                   63             16      23     102 
##                   10.5%           6.6%   10.8%        
## ------------------------------------------------------
## 6                  141             48      51     240 
##                   23.4%          19.8%   24.1%        
## ------------------------------------------------------
## 7                   71             35      30     136 
##                   11.8%          14.5%   14.2%        
## ------------------------------------------------------
## 8                   70             30      23     123 
##                   11.6%          12.4%   10.8%        
## ------------------------------------------------------
## 9                   47             28      11      86 
##                    7.8%          11.6%    5.2%        
## ------------------------------------------------------
## 10                  16             16       9      41 
##                    2.7%           6.6%    4.2%        
## ------------------------------------------------------
## 11                  39             14      13      66 
##                    6.5%           5.8%    6.1%        
## ------------------------------------------------------
## Total              602            242     212    1056 
##                   57.0%          22.9%   20.1%        
## ======================================================

3.4 Relationship between control variables

# Plot the relationship between every unique pair of columns in a data frame.
#
# Each column is paired with every column that comes after it; the earlier
# column goes on the x-axis and the later one on the y-axis. For each pair a
# jittered scatter plot with a linear fit line is built and printed.
#
# Args:
#   df: A data frame whose columns are plotted against each other.
#
# Returns:
#   Invisibly, the list of ggplot objects in pair order (1-2, 1-3, ..., 2-3,
#   ...). An empty list if `df` has fewer than two columns. The result is
#   invisible so that a top-level call does not auto-print the list and render
#   every plot a second time.
plot_relationships <- function(df) {
  col_names <- names(df)

  # With fewer than two columns there is no pair to plot
  if (length(col_names) < 2) {
    return(invisible(list()))
  }

  # All unique column pairs, in the same order as a nested i < j loop
  col_pairs <- combn(col_names, 2, simplify = FALSE)

  # Each plot is built inside its own function call so that the lazily
  # evaluated `.data[[x]]` / `.data[[y]]` mappings keep the names of *this*
  # pair rather than those of the last pair processed
  plots <- lapply(col_pairs, function(col_pair) {
    x <- col_pair[1]
    y <- col_pair[2]

    # `.data[[name]]` is the tidy-eval replacement for the deprecated
    # aes_string(); axis labels are set explicitly to the column names
    ggplot(df, aes(x = .data[[x]], y = .data[[y]])) +
      geom_point(
        position = position_jitter(width = 0.2, height = 0.2),
        alpha = 0.6,
        color = "skyblue"
      ) +
      geom_smooth(method = "lm", colour = "black", linewidth = 0.5) +
      theme_minimal() +
      labs(
        title = paste("Scatter plot between", x, "and", y),
        x = x,
        y = y
      )
  })

  # Print every plot exactly once
  for (p in plots) {
    print(p)
  }

  invisible(plots)
}

3.4.1 Create scatter plots for categorical measurement

# Drop academic_status so that only the remaining categorical controls are
# plotted against each other
controls_cat <- select(controls_cat, -academic_status)

plot_relationships(df = controls_cat)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## [[1]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[2]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[3]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[4]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[5]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[6]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[7]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[8]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[9]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[10]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[11]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[12]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[13]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[14]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[15]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[16]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[17]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[18]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[19]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[20]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[21]]
## `geom_smooth()` using formula = 'y ~ x'

3.4.2 Create scatter plots for numerical measurement

# Drop academic_status so that only the remaining numerical controls are
# plotted against each other
controls_num <- select(controls_num, -academic_status)

plot_relationships(df = controls_num)
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## [[1]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[2]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[3]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[4]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[5]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[6]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[7]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[8]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[9]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[10]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[11]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[12]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[13]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[14]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[15]]
## `geom_smooth()` using formula = 'y ~ x'